import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer
from scipy.stats import zscore
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.decomposition import PCA
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
veh= pd.read_csv("vehicle-1.csv") # Load the vehicle-silhouette dataset.
veh.shape # Shape of the dataset (rows, columns).
veh.head() # First rows, to get a general idea of the data.
veh.info() # Identify the datatype of each column.
veh.describe() # Summary statistics for the numeric columns.
veh.isnull().values.any() # Check whether any null values exist.
labelencoder = LabelEncoder() # Encoder for the categorical target column.
veh['class_dup'] = labelencoder.fit_transform(veh['class']) # Classes encoded in sorted order — presumably bus=0, car=1, van=2; confirm against the data.
veh # Display the frame with the new encoded column.
In the above step, the `class` column was categorical, which cannot be used directly by SVM algorithms. Its values were therefore label-encoded as numbers: bus is 0, car is 1 and van is 2.
# Inspect the per-column means that will be used for imputation.
veh.mean()
# Drop the original categorical target; class_dup keeps the encoded labels.
veh.drop("class", axis=1, inplace=True)
# Impute remaining nulls with each column's mean in a single vectorized call.
# Equivalent to the previous per-column `apply` of a fillna lambda (PEP 8
# discourages assigning lambdas to names), but clearer and faster.
veh = veh.fillna(veh.mean())
In the above step, null values were replaced with the mean values of their respective columns, and the original `class` column was dropped.
# Univariate distribution of every feature.
# sns.distplot is deprecated (removed in seaborn 0.14); histplot with
# kde=True and stat="density" is the modern equivalent. A loop replaces
# seventeen copy-pasted calls.
feature_cols = [
    "compactness", "circularity", "distance_circularity", "radius_ratio",
    "pr.axis_aspect_ratio", "max.length_aspect_ratio", "scatter_ratio",
    "elongatedness", "pr.axis_rectangularity", "scaled_variance",
    "scaled_variance.1", "scaled_radius_of_gyration",
    "scaled_radius_of_gyration.1", "skewness_about", "skewness_about.1",
    "skewness_about.2", "hollows_ratio",
]
for col in feature_cols:
    plt.figure()  # one figure per feature, as the original cells produced
    sns.histplot(veh[col], kde=True, stat="density")
# Bivariate analysis: pairwise scatter plots colored by encoded class,
# with KDE curves on the diagonal.
sns.pairplot(veh, hue='class_dup', diag_kind='kde')
# Correlation matrix, computed once — the original recomputed veh.corr()
# a second time only to display it.
core = veh.corr()
core
# Heatmap of the correlations; large figure so annotations stay readable.
plt.figure(figsize=(21, 15))
sns.heatmap(core, annot=True)
From the bivariate analysis, the correlation table and the correlation heatmap, the columns of least significance to the target variable were identified.
# Drop the features judged least significant in the correlation analysis.
# One drop(columns=...) call replaces seven separate in-place calls.
veh.drop(
    columns=[
        "compactness",
        "distance_circularity",
        "pr.axis_aspect_ratio",
        "max.length_rectangularity",
        "scatter_ratio",
        "pr.axis_rectangularity",
        "skewness_about.2",
    ],
    inplace=True,
)
# Separate features from target and standardize (z-score) — SVMs are
# sensitive to feature scale.
X = veh.drop(["class_dup"], axis=1)
Y = veh[["class_dup"]]
XScaled = X.apply(zscore)
x_train, x_test, y_train, y_test = train_test_split(XScaled, Y, test_size=0.3, random_state=7)

def fit_svc_and_report(kernel):
    """Fit an SVC (C=0.1, gamma=1) with the given kernel on the train
    split, print its test accuracy, and return the test predictions."""
    clf = SVC(C=0.1, kernel=kernel, gamma=1)
    clf.fit(x_train, y_train.values.ravel())
    y_pred = clf.predict(x_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    return y_pred

# The four kernels were previously four copy-pasted cells.
fit_svc_and_report('linear')
fit_svc_and_report('rbf')
y_pred = fit_svc_and_report('poly')  # best kernel: keep its predictions
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
# Pass labels=[2, 1, 0] so the matrix rows match the DataFrame's index —
# by default sklearn orders rows by sorted label (0, 1, 2), so the
# original heatmap's "2"/"1"/"0" labels were attached to the wrong rows.
knnm = metrics.confusion_matrix(y_test, y_pred, labels=[2, 1, 0])
knn_m = pd.DataFrame(knnm, index=["2", "1", "0"], columns=["Predict 2", "Predict 1", "Predict 0"])
plt.figure(figsize=(7, 5))
sns.heatmap(knn_m, annot=True)
print("Classification Report")
print(metrics.classification_report(y_test, y_pred, labels=[2, 1, 0]))
y_pred = fit_svc_and_report('sigmoid')
After trying the SVM algorithm with the different kernels, the following inferences were made:
- The highest model accuracy is obtained when the algorithm runs with a polynomial kernel.
- The highest accuracy score was 91%.
- The confusion matrix and classification report were produced for that model.
# 10-fold cross-validation on the full scaled dataset for a more robust
# accuracy estimate than a single train/test split.
kfold = KFold(n_splits=10, random_state=7, shuffle=True)
clf = SVC(C=0.1, kernel='poly', gamma=1)
# cross_val_score clones and refits the estimator on each fold, so the
# original's extra fit on x_train before cross-validating was redundant.
results = cross_val_score(clf, XScaled, Y.values.ravel(), cv=kfold)
print(results)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean() * 100.0, results.std() * 100.0))
After using K-fold cross-validation:
- The accuracy of the model ranged from 86% to 94% at a 95% confidence level.
# Covariance matrix of the standardized features (sanity check before PCA).
covMatrix = np.cov(XScaled, rowvar=False)
print(covMatrix)

# PCA with 6 components. The original fitted two identical PCA objects
# (pca and pca3); a single fit suffices and both are reused below.
pca = PCA(n_components=6)
pca.fit(XScaled)
print(pca.explained_variance_)
# Scree-style bar chart of the variance ratio per component.
plt.bar(list(range(1, 7)), pca.explained_variance_ratio_, alpha=0.5, align='center')
plt.ylabel('Variation explained')
plt.xlabel('Principal component')  # x-axis is the component index, not an eigenvalue
plt.show()
print(pca.components_)
print(pca.explained_variance_ratio_)
# Project the standardized data onto the 6 principal components.
Xpca3 = pca.transform(XScaled)
Xpca3
Xpca3.shape
sns.pairplot(pd.DataFrame(Xpca3))
# Re-run the same SVM kernel comparison on the 6-component PCA projection.
x_train, x_test, y_train, y_test = train_test_split(Xpca3, Y, test_size=0.3, random_state=7)

def fit_svc_on_pca(kernel):
    """Fit an SVC (C=0.1, gamma=1) with the given kernel on the PCA train
    split, print its test accuracy, and return the test predictions."""
    clf = SVC(C=0.1, kernel=kernel, gamma=1)
    clf.fit(x_train, y_train.values.ravel())
    y_pred = clf.predict(x_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    return y_pred

# The four kernels were previously four copy-pasted cells.
fit_svc_on_pca('linear')
fit_svc_on_pca('rbf')
y_pred = fit_svc_on_pca('poly')  # best kernel again: keep its predictions
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
# Pass labels=[2, 1, 0] so the matrix rows match the DataFrame's index —
# sklearn's default row order is sorted labels (0, 1, 2), so the original
# heatmap's "2"/"1"/"0" labels were attached to the wrong rows.
knnm = metrics.confusion_matrix(y_test, y_pred, labels=[2, 1, 0])
knn_m = pd.DataFrame(knnm, index=["2", "1", "0"], columns=["Predict 2", "Predict 1", "Predict 0"])
plt.figure(figsize=(7, 5))
sns.heatmap(knn_m, annot=True)
print("Classification Report")
print(metrics.classification_report(y_test, y_pred, labels=[2, 1, 0]))
y_pred = fit_svc_on_pca('sigmoid')
After running the SVM model on the PCA (eigenvector) projection with different kernels, the following was inferred:
- The maximum accuracy was again found with the polynomial kernel.
- The accuracy of this model was 83%.
- A drop of around 8% (from 91% to 83%) was observed, but this was expected because the number of columns was reduced.
# 10-fold cross-validation of the poly-kernel SVM on the PCA-projected data.
kfold = KFold(n_splits=10, random_state=7, shuffle=True)
clf = SVC(C=0.1, kernel='poly', gamma=1)
# cross_val_score refits a clone per fold; no separate pre-fit is needed.
results = cross_val_score(clf, Xpca3, Y.values.ravel(), cv=kfold)
print(results)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean() * 100.0, results.std() * 100.0))
After using K-fold cross-validation with the eigenvector projection:
- The accuracy of the model ranged from 80% to 87% at a 95% confidence level.

Even after selecting only the six principal components that covered 95% of the variance using PCA:
- The accuracy dropped from 91% to 83%.
- This result was within the expected range.